##Install Libraries

library(readr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ dplyr   1.0.10
## ✔ tibble  3.1.8      ✔ stringr 1.5.0 
## ✔ tidyr   1.2.1      ✔ forcats 0.5.2 
## ✔ purrr   0.3.5      
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(dplyr)
library(tidyr)
library(ggplot2)
library(mosaic)
## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2
## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Attaching package: 'mosaic'
## 
## The following object is masked from 'package:Matrix':
## 
##     mean
## 
## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally
## 
## The following object is masked from 'package:purrr':
## 
##     cross
## 
## The following object is masked from 'package:ggplot2':
## 
##     stat
## 
## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var
## 
## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum
library(patchwork)
library(mgcv)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## 
## The following object is masked from 'package:dplyr':
## 
##     collapse
## 
## This is mgcv 1.8-41. For overview type 'help("mgcv-package")'.
## 
## Attaching package: 'mgcv'
## 
## The following object is masked from 'package:mosaic':
## 
##     cnorm
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:mosaic':
## 
##     do
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

##Import Data

setwd("/Users/lorenzo_dube/Desktop/STA380/Final")

raw_listings = read_csv('austin_airbnb_listings.csv', show_col_types = FALSE)
raw_zip = read_csv('inventory_core_metrics_zip.csv', show_col_types = FALSE)

Data Cleaning

#Create df with relevant columns from raw data
df_listings <- raw_listings[, c('id',
                                'host_is_superhost',
                                'host_identity_verified',
                                'host_neighbourhood',
                                'latitude',
                                'longitude',
                                'neighbourhood_cleansed',
                                'property_type', 
                                'accommodates',
                                'bedrooms',
                                'beds',
                                'bathrooms_text',
                                'number_of_reviews',
                                'review_scores_rating',
                                'price')]

#Clean Price Column Data, remove $ and cast as numeric
df_listings$price <- as.numeric(gsub("\\$", "", df_listings$price))
## Warning: NAs introduced by coercion
#Clean Bathroom Data
df_listings <- df_listings %>%
  mutate(shared_bath = case_when(
    grepl(pattern = 'shared', x = bathrooms_text, ignore.case = TRUE) ~ 1))
df_listings$shared_bath <- replace_na(df_listings$shared_bath, replace = 0) #Replace NA values 


df_listings <- df_listings %>% rename('bathrooms' = 'bathrooms_text')
df_listings$bathrooms <- as.numeric(gsub("[^0-9.-]", "", df_listings$bathrooms))
## Warning: NAs introduced by coercion
#Rename columns
df_listings <- df_listings %>% rename('zip_code' = 'neighbourhood_cleansed',
                                      'listing_id' = 'id')


nrow(df_listings)
## [1] 18337
head(df_listings)
## # A tibble: 6 × 16
##   listing_id host_is_s…¹ host_…² host_…³ latit…⁴ longi…⁵ zip_c…⁶ prope…⁷ accom…⁸
##        <dbl> <lgl>       <lgl>   <chr>     <dbl>   <dbl>   <dbl> <chr>     <dbl>
## 1       5456 TRUE        TRUE    East D…    30.3   -97.7   78702 Entire…       3
## 2       5769 FALSE       TRUE    SW Wil…    30.5   -97.8   78729 Privat…       2
## 3   46856014 TRUE        TRUE    <NA>       30.8   -98.4   78611 Entire…       4
## 4     319887 FALSE       TRUE    East D…    30.3   -97.7   78702 Entire…       5
## 5     319894 TRUE        TRUE    <NA>       30.3   -97.6   78724 Entire…       8
## 6       6413 TRUE        TRUE    Travis…    30.2   -97.7   78704 Entire…       2
## # … with 7 more variables: bedrooms <dbl>, beds <dbl>, bathrooms <dbl>,
## #   number_of_reviews <dbl>, review_scores_rating <dbl>, price <dbl>,
## #   shared_bath <dbl>, and abbreviated variable names ¹​host_is_superhost,
## #   ²​host_identity_verified, ³​host_neighbourhood, ⁴​latitude, ⁵​longitude,
## #   ⁶​zip_code, ⁷​property_type, ⁸​accommodates

##Extracting Relevant Amenities

#Extract relevant amenities from raw data set 

#Pool 
pool <- (case_when(grepl(pattern= 'pool', x= raw_listings$amenities, ignore.case = TRUE) ~ 1))
pool <- replace_na(pool, replace = 0)   #Replace NA values with 0

#Fireplace
fireplace <- case_when(grepl(pattern ='fireplace', x= raw_listings$amenities, ignore.case = TRUE) ~ 1)
fireplace <- replace_na(fireplace, replace = 0)

#Jacuzzi 
jacuzzi <- case_when(grepl(pattern= 'jacuzzi', x= raw_listings$amenities, ignore.case = TRUE) ~ 1,
                      grepl(pattern= 'Hot Tub', x= raw_listings$amenities, ignore.case = TRUE) ~ 1)
jacuzzi <- replace_na(jacuzzi, replace = 0)

#Self Check-in
self_checkin <- case_when(grepl(pattern='Check-in', x= raw_listings$amenities, ignore.case = TRUE) ~ 1,
                          grepl(pattern='Check in', x= raw_listings$amenities, ignore.case = TRUE) ~ 1,
                          grepl(pattern='Key Pad', x= raw_listings$amenities, ignore.case = TRUE) ~ 1,
                          grepl(pattern='Keypad', x= raw_listings$amenities, ignore.case = TRUE) ~ 1)
self_checkin <- replace_na(self_checkin, replace = 0)


#Parking on Premises
parking <- case_when(grepl(pattern= 'Premises', x= raw_listings$amenities, ignore.case = TRUE) ~ 1)
parking <- replace_na(parking, replace = 0)


#Lake Access
lake_access <- case_when(grepl(pattern='Lake access', x=raw_listings$amenities, ignore.case = TRUE) ~ 1)
lake_access <- replace_na(lake_access, replace = 0)

#Sauna
sauna <- case_when(grepl(pattern= 'Sauna', x= raw_listings$amenities, ignore.case = TRUE) ~ 1)
sauna <- replace_na(sauna, replace = 0)

#Pets Allowed
pets_allowed <- case_when(grepl(pattern= 'Pets Allowed', x= raw_listings$amenities, ignore.case = TRUE) ~ 1)
pets_allowed <- replace_na(pets_allowed, replace = 0)

#Washer & Dryer
washer_dryer <- case_when(grepl(pattern= 'Washer', x= raw_listings$amenities, ignore.case = TRUE) ~ 1)
washer_dryer <- replace_na(washer_dryer, replace = 0)


#Gym
gym <- case_when(grepl(pattern= 'Gym', x= raw_listings$amenities, ignore.case = TRUE) ~ 1)
gym <- replace_na(gym, replace = 0)

amenities <- data.frame(pool, fireplace, jacuzzi, self_checkin, parking, lake_access,
                        sauna, pets_allowed, washer_dryer, gym)


#Bind Amenities to main dataframe
df_listings <- cbind(df_listings, amenities)


#Convert Boolean Values to 1 and 0
df_listings$host_is_superhost <- as.integer(df_listings$host_is_superhost)
df_listings$host_identity_verified <- as.integer(df_listings$host_identity_verified)

##Adding Zip Codes to Working Data

#Zip_Codes Dataframe
df_zip <- raw_zip[, c('postal_code','median_listing_price_per_square_foot','total_listing_count' )] %>%
  rename('median_price_SQFT' = 'median_listing_price_per_square_foot', 'zip_code' = 'postal_code')


#Merge both dataframes
df_listings <- merge(x=df_listings, y=df_zip, by = "zip_code", all.x = TRUE)

nrow(df_listings)
## [1] 18337
head(df_listings)
##   zip_code   listing_id host_is_superhost host_identity_verified
## 1    76530 4.827064e+07                 1                      1
## 2    76530 5.108356e+07                 1                      0
## 3    76530 5.367016e+07                 1                      1
## 4    76530 5.730915e+17                 1                      1
## 5    76530 6.855803e+17                 0                      0
## 6    76530 4.023989e+07                 1                      1
##   host_neighbourhood latitude longitude property_type accommodates bedrooms
## 1               <NA> 30.79622 -97.43305  Entire place            9        2
## 2               <NA> 30.72849 -97.55681     Farm stay            9        3
## 3      Silva Terrace 30.73638 -97.57190   Entire home           16       23
## 4      Silva Terrace 30.73674 -97.57270   Entire home           16        5
## 5  Lakewood Addition 30.73303 -97.25860  Entire cabin            4        1
## 6               <NA> 30.79503 -97.42569  Entire place            7        3
##   beds bathrooms number_of_reviews review_scores_rating price shared_bath pool
## 1    4         2                51                 4.76   258           0    0
## 2    6         3                11                 5.00   307           0    0
## 3   26        11                 6                 5.00    NA           0    1
## 4    7         3                11                 4.64   484           0    1
## 5    2         1                 0                   NA   117           0    0
## 6    5         2               138                 4.88   234           0    0
##   fireplace jacuzzi self_checkin parking lake_access sauna pets_allowed
## 1         0       0            1       1           0     0            0
## 2         1       0            1       1           0     0            0
## 3         0       0            0       1           0     0            0
## 4         0       0            0       1           0     0            0
## 5         1       0            0       1           0     0            0
## 6         0       0            0       1           0     0            0
##   washer_dryer gym median_price_SQFT total_listing_count
## 1            1   0               304                  11
## 2            1   0               304                  11
## 3            1   0               304                  11
## 4            1   0               304                  11
## 5            0   0               304                  11
## 6            1   0               304                  11

##Removing NA Values

#Remove any rows with N/A values
#df_listings <- df_listings %>% drop_na(accommodates, bathrooms, bedrooms, price, number_of_reviews, median_price_SQFT)
#nrow(df_listings)

df_listings <- df_listings %>% drop_na(accommodates, bathrooms, bedrooms, price, number_of_reviews)
nrow(df_listings)
## [1] 16396

##Property Categorization Simplification

#Categorizes property types into 7 types of properties (listed in the output)

df_listings <- df_listings %>% 
  mutate(property_type_clean = case_when(str_detect(property_type, 'Private room') ~ 'private_room',
                                   str_detect(property_type, 'Room') ~ 'private_room',
                                   str_detect(property_type, 'Entire guest suite') ~ 'private_room',
                                   str_detect(property_type, 'Shared room') ~ 'shared_room',
                                   str_detect(property_type, 'Boat') ~ 'camper_rv_boat',
                                   str_detect(property_type, 'Camper/RV') ~ 'camper_rv_boat',
                                   str_detect(property_type, 'Houseboat') ~ 'camper_rv_boat',
                                   str_detect(property_type, 'Bus') ~ 'camper_rv_boat',
                                   str_detect(property_type, 'Tent') ~ 'outdoor',
                                   str_detect(property_type, 'Yurt') ~ 'outdoor',
                                   str_detect(property_type, 'Treehouse') ~ 'outdoor',
                                   str_detect(property_type, 'Tipi') ~ 'outdoor',
                                   str_detect(property_type, 'Campsite') ~ 'outdoor',
                                   str_detect(property_type, 'Cave') ~ 'outdoor',
                                   str_detect(property_type, 'Entire condo') ~ 'condo_apartment',
                                   str_detect(property_type, 'Entire loft') ~ 'condo_apartment',
                                   str_detect(property_type, 'Entire serviced apartment') ~ 'condo_apartment',
                                   str_detect(property_type, 'Entire rental unit') ~ 'condo_apartment',
                                   str_detect(property_type, 'Entire home/apt') ~ 'condo_apartment',
                                   str_detect(property_type, 'Entire') ~ 'house',
                                   str_detect(property_type, 'Tiny home') ~ 'house',
                                   TRUE ~ 'other'))

table(df_listings$property_type_clean)
## 
##  camper_rv_boat condo_apartment           house           other         outdoor 
##             187            4264            8221             170             122 
##    private_room     shared_room 
##            3299             133

##Downtown Categorization

#Categorizes proximity of zip code to downtown
#0: Not Downtown or Downtown adjacent
#1: Downtown Adjacent 
#2: Actual Downtown

df_listings <- df_listings %>% 
  mutate(downtown = case_when(zip_code == 78701 ~ 2,
                              zip_code == 78712 ~ 1,
                              zip_code == 78702 ~ 1, 
                              zip_code == 78703 ~ 1, 
                              zip_code == 78704 ~ 1,
                              TRUE ~ 0))

 table(df_listings$downtown)     
## 
##     0     1     2 
## 11790  3650   956

##Cleaning Host Verificaiton

#Sets any NA values in host_identity_verified to 0 (assumption is that N/A also means that the identity was not verified)
df_listings$host_identity_verified[is.na(df_listings$host_identity_verified)] <- 0

##Exploratory Data Analysis Looking at the summary statistics of a few variables (price, accomodates, bathrooms, and bedrooms), we can see that the values make sense. It is worthy to note that the spread of the values from q95 to max is significant. This indicates that we have outliers which could later impact our regression analysis and prediction model.

#Looks at the quantiles, medians, and means of price, accomodates, bathrooms, and bedrooms variables

df.sum <- df_listings %>%
  select(price, accommodates, bathrooms, bedrooms) %>% # select variables to summarise
  summarise_each(funs(min = min, 
                      q25 = quantile(., 0.25), 
                      median = median, 
                      q75 = quantile(., 0.75),
                      q95 = quantile(., 0.95),  
                      max = max,
                      mean = mean, 
                      sd = sd))
## Warning: `summarise_each_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `across()` instead.
## ℹ The deprecated feature was likely used in the dplyr package.
##   Please report the issue at <]8;;https://github.com/tidyverse/dplyr/issueshttps://github.com/tidyverse/dplyr/issues]8;;>.
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## ℹ Please use a list of either functions or lambdas:
## 
## # Simple named list: list(mean = mean, median = median)
## 
## # Auto named with `tibble::lst()`: tibble::lst(mean, median)
## 
## # Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
df.stats.tidy <- df.sum %>% gather(stat, val) %>%
  separate(stat, into = c("var", "stat"), sep = "_") %>%
  spread(stat, val) %>%
  select(var, min, q25, median, q75, q95, max, mean, sd)
## Warning: attributes are not identical across measure variables;
## they will be dropped
print(df.stats.tidy)
##            var min q25 median q75 q95 max       mean          sd
## 1 accommodates   1   2      4   6  12  16   4.993718   3.1947879
## 2    bathrooms   0   1      1   2   3   8   1.632959   0.8153995
## 3     bedrooms   1   1      2   3   4  12   2.014943   1.1872901
## 4        price   1 100    165 275 604 999 220.315504 178.0958600
print(paste('Total Data Rows: ', nrow(df_listings)))
## [1] "Total Data Rows:  16396"

Focusing on price, 2 standard deviations above the mean is 578. Note that q95 of price is 604 meaning that atleast 5% of the 16396 data points are above 2 standard deviations of the mean.

Now we compare look at the same variables but we group by property type. First, we see most of the property types are apartment/condos, houses, and private rooms. We also see differences in the median/mean and the quantile values for all property types.

df.sum.grouped <- df_listings %>%
  group_by(property_type_clean) %>%
  select(price, accommodates, bathrooms, bedrooms) %>% # select variables to summarise
  summarise_each(funs(min = min, 
                      q25 = quantile(., 0.25), 
                      median = median, 
                      q75 = quantile(., 0.75),
                      q95 = quantile(., 0.95),  
                      max = max,
                      mean = mean, 
                      sd = sd,
                      n=n()))
## Adding missing grouping variables: `property_type_clean`
t(df.sum.grouped)
##                     [,1]             [,2]              [,3]        [,4]       
## property_type_clean "camper_rv_boat" "condo_apartment" "house"     "other"    
## price_min           "25"             "20"              " 1"        "42"       
## accommodates_min    "1"              "1"               "1"         "1"        
## bathrooms_min       "0"              "0"               "0"         "1"        
## bedrooms_min        "1"              "1"               "1"         "1"        
## price_q25           " 85.00"         "104.00"          "150.00"    "132.25"   
## accommodates_q25    "2.00"           "3.00"            "4.00"      "3.25"     
## bathrooms_q25       "1"              "1"               "1"         "1"        
## bedrooms_q25        "1"              "1"               "2"         "1"        
## price_median        "100.0"          "150.0"           "219.0"     "200.0"    
## accommodates_median "3"              "4"               "6"         "5"        
## bathrooms_median    "1"              "1"               "2"         "1"        
## bedrooms_median     "1"              "1"               "3"         "2"        
## price_q75           "137.5"          "236.0"           "350.0"     "306.0"    
## accommodates_q75    "4"              "5"               "8"         "9"        
## bathrooms_q75       "1.0"            "2.0"             "2.5"       "2.0"      
## bedrooms_q75        "1"              "2"               "3"         "3"        
## price_q95           "227.3"          "465.0"           "739.0"     "600.0"    
## accommodates_q95    " 8.0"           " 8.0"            "14.0"      "15.1"     
## bathrooms_q95       "2.0"            "2.0"             "3.5"       "3.0"      
## bedrooms_q95        "2"              "3"               "5"         "5"        
## price_max           "679"            "999"             "999"       "950"      
## accommodates_max    "14"             "16"              "16"        "16"       
## bathrooms_max       "5.5"            "7.0"             "8.0"       "7.5"      
## bedrooms_max        " 5"             " 7"              "11"        " 8"       
## price_mean          "123.52941"      "193.49250"       "284.66719" "249.05294"
## accommodates_mean   "3.679144"       "4.202158"        "6.575477"  "6.305882" 
## bathrooms_mean      "1.114973"       "1.352955"        "1.986316"  "1.635294" 
## bedrooms_mean       "1.283422"       "1.497889"        "2.685196"  "2.164706" 
## price_sd            " 85.85761"      "132.25487"       "195.72190" "162.17064"
## accommodates_sd     "2.166148"       "1.996933"        "3.302517"  "3.995629" 
## bathrooms_sd        "0.4506750"      "0.5422135"       "0.8904201" "0.9711466"
## bedrooms_sd         "0.6220786"      "0.6887012"       "1.2122293" "1.3918364"
## price_n             " 187"           "4264"            "8221"      " 170"     
## accommodates_n      " 187"           "4264"            "8221"      " 170"     
## bathrooms_n         " 187"           "4264"            "8221"      " 170"     
## bedrooms_n          " 187"           "4264"            "8221"      " 170"     
##                     [,5]        [,6]           [,7]         
## property_type_clean "outdoor"   "private_room" "shared_room"
## price_min           "18"        "10"           "15"         
## accommodates_min    "1"         "1"            "1"          
## bathrooms_min       "0"         "0"            "0"          
## bedrooms_min        "1"         "1"            "1"          
## price_q25           "120.00"    " 50.00"       " 25.00"     
## accommodates_q25    "2.00"      "2.00"         "1.00"       
## bathrooms_q25       "1"         "1"            "1"          
## bedrooms_q25        "1"         "1"            "1"          
## price_median        "235.5"     " 70.0"        " 35.0"      
## accommodates_median "3"         "2"            "1"          
## bathrooms_median    "1"         "1"            "1"          
## bedrooms_median     "1"         "1"            "1"          
## price_q75           "335.0"     "115.0"        " 75.0"      
## accommodates_q75    "4"         "2"            "2"          
## bathrooms_q75       "1.0"       "1.0"          "1.0"        
## bedrooms_q75        "1"         "1"            "1"          
## price_q95           "479.0"     "275.0"        "256.0"      
## accommodates_q95    " 6.0"      " 4.0"         " 4.0"       
## bathrooms_q95       "2.0"       "2.0"          "4.0"        
## bedrooms_q95        "2"         "2"            "1"          
## price_max           "950"       "999"          "700"        
## accommodates_max    "16"        "16"           " 8"         
## bathrooms_max       "4.0"       "7.5"          "4.0"        
## bedrooms_max        "12"        " 8"           " 1"         
## price_mean          "243.81967" "103.93271"    " 67.16541"  
## accommodates_mean   "3.729508"  "2.264019"     "1.639098"   
## bathrooms_mean      "1.000000"  "1.177781"     "1.364662"   
## bedrooms_mean       "1.336066"  "1.112761"     "1.000000"   
## price_sd            "160.45916" "100.56662"    "103.25323"  
## accommodates_sd     "2.790293"  "1.334689"     "1.130422"   
## bathrooms_sd        "0.5182616" "0.4360156"    "0.9557002"  
## bedrooms_sd         "1.3582357" "0.4164786"    "0.0000000"  
## price_n             " 122"      "3299"         " 133"       
## accommodates_n      " 122"      "3299"         " 133"       
## bathrooms_n         " 122"      "3299"         " 133"       
## bedrooms_n          " 122"      "3299"         " 133"

Looking at the typical number of reviews listings generally have. We see most listings have few ratings (more than 25% of listings have 0 or 1 rating). There seem to be some outliers that have an extremely high number of ratings (most likely correlated with the variable host_is_superhost)

df_listings %>%
  summarize(min = min(number_of_reviews),
            q25 = quantile(number_of_reviews, 0.25),
            median = median(number_of_reviews),
            q75 = quantile(number_of_reviews, 0.75),
            q95 = quantile(number_of_reviews, 0.95),
            max = max(number_of_reviews),
            mean = mean(number_of_reviews),
            sd = sd(number_of_reviews)
          )
##   min q25 median q75 q95  max     mean       sd
## 1   0   1      9  36 164 1062 34.95109 69.83448

Histograms of the variables of interest show that all of our variables are skewed right (except review_score_rating which is skewed left). This shows that the majority of our data points are concentrated on the lower end of the values that our variables could take (such as over 50% of our data points only have 1 bathroom). Also worthy of note is that prices exhibit discontinuous behavior. We guess that this is because people who are listing tend to choose prices that are round numbers of close to round numbers such as prices close to 150, 175, 200. The spikes in the count of prices tend to occur in increments of 25.

hist_price = ggplot(df_listings) + 
  geom_histogram(aes(x = price), bins=75)

hist_accommodates = ggplot(df_listings) + 
  geom_histogram(aes(x = accommodates), bins=16)

hist_bedrooms = ggplot(df_listings) + 
  geom_histogram(aes(x = bedrooms), bins=11)

hist_bathrooms = ggplot(df_listings) + 
  geom_histogram(aes(x = bathrooms), bins=16)

hist_num_reviews = ggplot(df_listings) + 
  geom_histogram(aes(x = number_of_reviews), bins=30)

hist_review_scores = ggplot(df_listings) + 
  geom_histogram(aes(x = review_scores_rating), bins=30)

hist_price / hist_accommodates

hist_bedrooms / hist_bathrooms

hist_num_reviews / hist_review_scores
## Warning: Removed 2958 rows containing non-finite values (`stat_bin()`).

Observations for all Boxplots: Although the majority of listings seem to be concentrated below the $250 price point there seems to be a listing almost at every price point for bedrooms, bathrooms, acommodates. This is an indication that some other factor is driving the price of a listing, could be location or ammenities such as a pool, jacuzzi, etc.

As expected, the median price seems to increase with the number of bedrooms and bathrooms and the relationship seems approximately linear. However, note that once we reach more than 7 bedrooms or more than 5 bathrooms, we see significantly fewer data points which could bias our results.

#Boxplot and Scatterplot for price vs (bedrooms and bathrooms)

box1 = ggplot(df_listings) + 
  geom_boxplot(aes(x = factor(bedrooms), y = price)) 

p1 = ggplot(df_listings) + 
  geom_point(aes(x = bedrooms, y = price)) 

box2 = ggplot(df_listings) + 
  geom_boxplot(aes(x = factor(bathrooms), y = price))

p2 = ggplot(df_listings) + 
  geom_point(aes(x = bathrooms, y = price))

box1 / p1

box2 / p2

We can see that the median price also seems to increase with the number of people a listing can accomodate. The relationship also seems somewhat linear.

#Boxplot and Scatterplot for price vs accomodates

box3 = ggplot(df_listings) + 
  geom_boxplot(aes(x = factor(accommodates), y = price)) 

p3 = ggplot(df_listings) + 
  geom_point(aes(x = accommodates, y = price))

box3 / p3

#Boxplot and Scatterplot for price vs property_type_cleaned

box4 = ggplot(df_listings) + 
  geom_boxplot(aes(x = factor(property_type_clean), y = price)) +
  theme(axis.text.x = element_text(angle = 45))

p4 = ggplot(df_listings) + 
  geom_point(aes(x = factor(property_type_clean), y = price)) +
  theme(axis.text.x = element_text(angle = 45))

box4 / p4

As expected, as properties get close to downtown, prices tend to increase. Interesting here is that although downtown listings have higher median prices, it has less values for prices on the very high end potnentially due to the limited size of listings downtown or lower number of observations compared to the other locations.

#Boxplot and Scatterplot for price vs downtown
#0: Not Downtown or Downtown Adjacent 
#1: Downtown Adjacent
#2: Actual Downtown

box5 = ggplot(df_listings) + 
  geom_boxplot(aes(x = factor(downtown), y = price)) 

p5 = ggplot(df_listings) + 
  geom_point(aes(x = factor(downtown), y = price)) 

box5 / p5

table(df_listings$downtown) 
## 
##     0     1     2 
## 11790  3650   956
#Scatterplot for price vs (number_of_reviews and review_scores_rating)

p6 = ggplot(df_listings) + 
  geom_point(aes(x = number_of_reviews, y = price)) 

p7 = ggplot(df_listings) + 
  geom_point(aes(x = review_scores_rating, y = price)) 

p6 / p7
## Warning: Removed 2958 rows containing missing values (`geom_point()`).

#Scatterplot for price vs (number_of_reviews and review_scores_rating)

p6 = ggplot(df_listings) + 
  geom_point(aes(x = number_of_reviews, y = price)) 

p7 = ggplot(df_listings) + 
  geom_point(aes(x = review_scores_rating, y = price)) 

p6 / p7
## Warning: Removed 2958 rows containing missing values (`geom_point()`).

#Correlation Matrix for Reference (note: we are not choosing variables for regression based on this) 
#Only for referece to look at potentially odd correlations

price_cor <- data.frame(cor(df_listings[,unlist(lapply(df_listings, is.numeric))]))[13,]

price_cor <- t(price_cor)
price_cor <- na.omit(price_cor)
price_cor <- data.frame(price_cor)

price_cor %>% arrange(desc(price))
##                                price
## price                   1.0000000000
## accommodates            0.6087882100
## bedrooms                0.5956788842
## bathrooms               0.5856397530
## downtown                0.1779432231
## jacuzzi                 0.1735864479
## fireplace               0.1408768646
## pool                    0.1304871443
## washer_dryer            0.1272265271
## lake_access             0.1171017919
## pets_allowed            0.0597305837
## parking                 0.0360955896
## sauna                   0.0261331205
## gym                     0.0230591847
## host_is_superhost       0.0214546469
## latitude                0.0052875533
## self_checkin           -0.0003704018
## zip_code               -0.0267970206
## listing_id             -0.0361378431
## host_identity_verified -0.0543667023
## number_of_reviews      -0.0670189955
## longitude              -0.1657145269
## shared_bath            -0.2626263097
#Price vs. median_price_SQFT for small (1 person) and large (12 person) listings
df_listings %>% filter(accommodates == 1  | accommodates == 12) %>%
  ggplot(aes(x=median_price_SQFT, y=price)) + 
  geom_point(alpha = .8, aes(color=factor(accommodates))) + 
  scale_color_viridis_d() + 
  labs(x='Median $ per SQFT.', y='Price', color = 'Accommodates')
## Warning: Removed 1 rows containing missing values (`geom_point()`).

  #Use viridis color scheme
#Price vs. Airbnb Size (Accommodates) for most and least expensive zipcode
df_listings %>% 
  filter(median_price_SQFT == max(df_listings$median_price_SQFT) | median_price_SQFT == min(df_listings$median_price_SQFT)) %>%
    ggplot(aes(x=accommodates,y=price)) +
    geom_point(alpha = 0.7, aes(color=factor(median_price_SQFT))) + 
    scale_color_viridis_d() + 
    labs(x='Accommodates', y='Price', color= 'Median $ per SQFT.')

#Cooks Distance
acc_fit = lm(price ~ accommodates, data = df_listings)
bath_fit = lm(price ~ bathrooms, data = df_listings)
bedroom_fit = lm(price ~ bedrooms, data = df_listings)
beds_fit = lm(price ~ beds, data = df_listings)

#A a general rule of thumb is that any point with a cook's distance
#over 4/n is considered an outlier
4/nrow(df_listings)
## [1] 0.0002439619
plot(cooks.distance(bedroom_fit)) + abline(h=4/nrow(df_listings), col = 'red')

## integer(0)

##Functions for SLR Diagnostics

#########################################
#Simple Model Summary Function
#########################################
model_summaries <- function(data, xvar, yvar){
  xvar = enquo(arg = xvar)
  yvar = enquo(arg = yvar)
  #create new df to work from to not affect original data
  new.df <- data %>% 
    select(!!xvar, !!yvar) %>%
    mutate(observation = 1:nrow(data))
  name_list = names(new.df)

#linear
  linear.model = lm(unlist(new.df[2])~unlist(new.df[1]), data=new.df)
  print('Linear Model')
  print(summary(linear.model))
  print(' ')
  print(' ')
#squared
  squared.model = lm(unlist(new.df[2])~poly((unlist(new.df[1])), 2), data=new.df)
  print('Squared Model')
  print(summary(squared.model))
  print(' ')
  print(' ')
#cubic
  cubic.model = lm(unlist(new.df[2])~poly((unlist(new.df[1])), 3), data=new.df)
  print('Cubic Model')
  print(summary(cubic.model))
  print(' ')
  print(' ')
#spline
  print('Spline Model')
  spline.model = gam(unlist(new.df[2])~s(unlist(new.df[1])), data=new.df)
  print(summary(spline.model))
  print(' ')
  print(' ')
}


########################################
#Diagnostics Function for Linear Models
########################################
linear_diagnostics <- function(data, xvar, yvar){
  
  
  xvar = enquo(arg = xvar)
  yvar = enquo(arg = yvar)
  #create new df to work from to not affect original data
  new.df <- data %>% 
    select(!!xvar, !!yvar) %>%
    mutate(observation = 1:nrow(data))
  name_list = names(new.df)


  lm.model = lm(unlist(new.df[2])~unlist(new.df[1]), data=new.df)

  print(summary(lm.model))
  
  # Tack the residuals onto our dataset
  new.df = new.df %>% mutate(resids = resid(lm.model),
                       stdresids = rstandard(lm.model))
  
  # Linear Regression Visualization
  p <-ggplot(aes(x=unlist(new.df[1]), y=!!yvar), data=new.df) + 
    geom_point(alpha = 0.6) + 
    geom_smooth(method="lm") + xlab(name_list[1]) + ggtitle(paste0(name_list[1], ' Regressed to Price'))

  print(p)
  
  # Always plot residuals vs. predictor variable
  # We're looking for nonlinear trends -- indicating E(e_i|X) != 0 --
  # and for patterns in the spread indicating Var(e_i|X) isn't constant
  q <- ggplot(aes(x=unlist(new.df[1]), y=resids), data=new.df) + 
    geom_point() + 
    geom_smooth() + xlab(name_list[1]) + ggtitle(paste0('Residuals vs ', name_list[1]))
  print(q)
  
  r <- ggplot(aes(x=unlist(new.df[1]), y=stdresids), data=new.df) + 
    geom_point() + 
    geom_smooth() + xlab(name_list[1]) + ggtitle(paste0('Standardized Residuals vs ', name_list[1]))
  print(r)
  
   #Dated absolute residuals
  x <- ggplot(aes(x=as.Date(observation,origin="1993-12-31"), y=stdresids), data=new.df) + 
    geom_point() + 
    geom_smooth() + 
    geom_hline(yintercept=0, col='red')
  print(x)
  x <- ggplot(aes(x=as.Date(observation,origin="1993-12-31"), y=abs(stdresids)), data=new.df) + 
    geom_point() + 
    geom_smooth() + 
    geom_hline(yintercept=0, col='red')
  print(x)
  
  # Plotting successive residuals against each other
  # head() and tail() here are used to get "every day except the last"
  # and "every day except the first", respectively
  # see help(head)
  lag_df = data.frame(resid.today =head(rstandard(lm.model),-1),
                      resid.tomorrow = tail(rstandard(lm.model),-1))
  
  print(cor(lag_df$resid.today, lag_df$resid.tomorrow))  
  
  y <- ggplot(aes(x=resid.today, y=resid.tomorrow), data=lag_df) +
    geom_point() + 
    geom_smooth()
  print(y)




  # Always plot residuals^2 vs.
  # predictor variable
  # It's easier to read off trends in the spread of the residuals here -- remember,
  # the expected residual should be zero so the level here should be constant
  s <- ggplot(aes(x=unlist(new.df[1]), y=resids^2), data=new.df) + 
    geom_point() + 
    geom_smooth() + xlab(name_list[1]) + ggtitle(paste0('Squared Residuals vs ', name_list[1]))
  print(s)
  
  t <- ggplot(aes(x=unlist(new.df[1]), y=stdresids^2), data=new.df) + 
    geom_point() + 
    geom_smooth() + xlab(name_list[1]) + ggtitle(paste0('Squared Standardized Residuals vs ', name_list[1]))
  print(t)
  
  # Those plots are sensitive to outliers, let's try absolute values
  u <- ggplot(aes(x=unlist(new.df[1]), y=abs(resids)), data=new.df) + 
    geom_point() + 
    geom_smooth() + 
    geom_hline(aes(yintercept=mean(abs(resids)), col='red')) + xlab(name_list[1]) + ggtitle(paste0('Absolute Value Residuals vs ', name_list[1]))
  print(u)
  
  v <- ggplot(aes(x=unlist(new.df[1]), y=abs(stdresids)), data=new.df) + 
    geom_point() + 
    geom_smooth() + 
    geom_hline(aes(yintercept=mean(abs(stdresids)), col='red')) + xlab(name_list[1]) + ggtitle(paste0('Absolute Value Standardized Residuals vs ', name_list[1]))
  print(v)
  
  # Always plot distribution of residuals
  w <- ggplot(aes(x=stdresids), data=new.df) + 
    geom_histogram(aes(y=..density..), bins=30) + 
    stat_function(fun = dnorm, args = list(mean = 0, sd = 1), col='blue') + ggtitle(paste0('Distribution of Residuals from ', name_list[1]))
  print(w)
  
  ## ----qqplot-of-residuals, echo=FALSE-------------------------------------
  # An alternative: plot observed quantiles vs. theoretical Gaussian quantiles
  qqnorm(residuals(lm.model))
  qqline(residuals(lm.model))
  
  qqnorm(rstandard(lm.model))
  qqline(rstandard(lm.model))
}


##########################################
#Diagnostics Function for Squared Models
##########################################
quad_diagnostics <- function(data, xvar, yvar){
  
  
  xvar = enquo(arg = xvar)
  yvar = enquo(arg = yvar)
  #create new df to work from to not affect original data
  new.df <- data %>% 
    select(!!xvar, !!yvar) %>%
    mutate(observation = 1:nrow(data))
  name_list = names(new.df)


  lm.model = lm(unlist(new.df[2])~poly((unlist(new.df[1])), 2), data=new.df)

  print(summary(lm.model))
  
  # Tack the residuals onto our dataset
  new.df = new.df %>% mutate(resids = resid(lm.model),
                       stdresids = rstandard(lm.model))
  
  # Regression Visualization
  p <-ggplot(aes(x=unlist(new.df[1]), y=!!yvar), data=new.df) + 
    geom_point(alpha = 0.6) + 
    geom_smooth(method="lm", formula = y ~ poly(x, 2)) + xlab(name_list[1]) + ggtitle(paste0(name_list[1], ' Regressed to Price'))

  print(p)


  # Always plot residuals vs. predictor variable
  # We're looking for nonlinear trends -- indicating E(e_i|X) != 0 --
  # and for patterns in the spread indicating Var(e_i|X) isn't constant
  q <- ggplot(aes(x=unlist(new.df[1]), y=resids), data=new.df) + 
    geom_point() + 
    geom_smooth() + xlab(name_list[1]) + ggtitle(paste0('Residuals vs ', name_list[1]))
  print(q)
  
  r <- ggplot(aes(x=unlist(new.df[1]), y=stdresids), data=new.df) + 
    geom_point() + 
    geom_smooth() + xlab(name_list[1]) + ggtitle(paste0('Standardized Residuals vs ', name_list[1]))
  print(r)
  
   #Dated absolute residuals
  x <- ggplot(aes(x=as.Date(observation,origin="1993-12-31"), y=stdresids), data=new.df) + 
    geom_point() + 
    geom_smooth() + 
    geom_hline(yintercept=0, col='red')
  print(x)
  x <- ggplot(aes(x=as.Date(observation,origin="1993-12-31"), y=abs(stdresids)), data=new.df) + 
    geom_point() + 
    geom_smooth() + 
    geom_hline(yintercept=0, col='red')
  print(x)
  
  # Plotting successive residuals against each other
  # head() and tail() here are used to get "every day except the last"
  # and "every day except the first", respectively
  # see help(head)
  lag_df = data.frame(resid.today =head(rstandard(lm.model),-1),
                      resid.tomorrow = tail(rstandard(lm.model),-1))

  print(cor(lag_df$resid.today, lag_df$resid.tomorrow))  
  
  y <- ggplot(aes(x=resid.today, y=resid.tomorrow), data=lag_df) +
    geom_point() + 
    geom_smooth()
  print(y)




  # Always plot residuals^2 vs.
  # predictor variable
  # It's easier to read off trends in the spread of the residuals here -- remember,
  # the expected residual should be zero so the level here should be constant
  s <- ggplot(aes(x=unlist(new.df[1]), y=resids^2), data=new.df) + 
    geom_point() + 
    geom_smooth() + geom_hline(aes(yintercept=mean(resids^2), col='red')) + xlab(name_list[1]) + ggtitle(paste0('Squared Residuals vs ', name_list[1]))
  print(s)
  
  t <- ggplot(aes(x=unlist(new.df[1]), y=stdresids^2), data=new.df) + 
    geom_point() + 
    geom_smooth() + geom_hline(aes(yintercept=mean(stdresids^2), col='red')) + xlab(name_list[1]) + ggtitle(paste0('Squared Standardized Residuals vs ', name_list[1]))
  print(t)
  
  # Those plots are sensitive to outliers, let's try absolute values
  u <- ggplot(aes(x=unlist(new.df[1]), y=abs(resids)), data=new.df) + 
    geom_point() + 
    geom_smooth() + 
    geom_hline(aes(yintercept=mean(abs(resids)), col='red')) + xlab(name_list[1]) + ggtitle(paste0('Absolute Value Residuals vs ', name_list[1]))
  print(u)
  
  v <- ggplot(aes(x=unlist(new.df[1]), y=abs(stdresids)), data=new.df) + 
    geom_point() + 
    geom_smooth() + 
    geom_hline(aes(yintercept=mean(abs(stdresids)), col='red')) + xlab(name_list[1]) + ggtitle(paste0('Absolute Value Standardized Residuals vs ', name_list[1]))
  print(v)
  
  # Always plot distribution of residuals
  w <- ggplot(aes(x=stdresids), data=new.df) + 
    geom_histogram(aes(y=..density..), bins=30) + 
    stat_function(fun = dnorm, args = list(mean = 0, sd = 1), col='blue') + ggtitle(paste0('Distribution of Residuals from ', name_list[1]))
  print(w)
  
  ## ----qqplot-of-residuals, echo=FALSE-------------------------------------
  # An alternative: plot observed quantiles vs. theoretical Gaussian quantiles
  qqnorm(residuals(lm.model))
  qqline(residuals(lm.model))
  
  qqnorm(rstandard(lm.model))
  qqline(rstandard(lm.model))
}

##########################################
#Diagnostics Function for Cubic Models
##########################################
cubic_diagnostics <- function(data, xvar, yvar){
  
  
  xvar = enquo(arg = xvar)
  yvar = enquo(arg = yvar)
  #create new df to work from to not affect original data
  new.df <- data %>% 
    select(!!xvar, !!yvar) %>%
    mutate(observation = 1:nrow(data))
  name_list = names(new.df)


  lm.model = lm(unlist(new.df[2])~poly((unlist(new.df[1])), 3), data=new.df)

  print(summary(lm.model))
  
  # Tack the residuals onto our dataset
  new.df = new.df %>% mutate(resids = resid(lm.model),
                       stdresids = rstandard(lm.model))
  
  # Regression Visualization
  p <-ggplot(aes(x=unlist(new.df[1]), y=!!yvar), data=new.df) + 
    #geom_point(alpha = 0.6) + 
    #geom_smooth(method="lm", formula = y ~ poly(x, 3)) + xlab(name_list[1]) + ggtitle(paste0(name_list[1], ' Regressed to Price'))

  print(p)

  
  # Always plot residuals vs. predictor variable
  # We're looking for nonlinear trends -- indicating E(e_i|X) != 0 --
  # and for patterns in the spread indicating Var(e_i|X) isn't constant
  q <- ggplot(aes(x=unlist(new.df[1]), y=resids), data=new.df) + 
    geom_point() + 
    geom_smooth() + xlab(name_list[1]) + ggtitle(paste0('Residuals vs ', name_list[1]))
    print(q)
  
  r <- ggplot(aes(x=unlist(new.df[1]), y=stdresids), data=new.df) + 
    geom_point() + 
    geom_smooth() + xlab(name_list[1]) + ggtitle(paste0('Standardized Residuals vs ', name_list[1]))
  print(r)
  
   #Dated absolute residuals
  x <- ggplot(aes(x=as.Date(observation,origin="1993-12-31"), y=stdresids), data=new.df) + 
    geom_point() + 
    geom_smooth() + 
    geom_hline(yintercept=0, col='red')
  print(x)
  x <- ggplot(aes(x=as.Date(observation,origin="1993-12-31"), y=abs(stdresids)), data=new.df) + 
    geom_point() + 
    geom_smooth() + 
    geom_hline(yintercept=0, col='red')
  print(x)
  
  # Plotting successive residuals against each other
  # head() and tail() here are used to get "every day except the last"
  # and "every day except the first", respectively
  # see help(head)
  lag_df = data.frame(resid.today =head(rstandard(lm.model),-1),
                      resid.tomorrow = tail(rstandard(lm.model),-1))
  
  print(cor(lag_df$resid.today, lag_df$resid.tomorrow))

  y <- ggplot(aes(x=resid.today, y=resid.tomorrow), data=lag_df) +
    geom_point() + 
    geom_smooth()
  print(y)




  # Always plot residuals^2 vs.
  # predictor variable
  # It's easier to read off trends in the spread of the residuals here -- remember,
  # the expected residual should be zero so the level here should be constant
  s <- ggplot(aes(x=unlist(new.df[1]), y=resids^2), data=new.df) + 
    geom_point() + 
    geom_smooth() + geom_hline(aes(yintercept=mean(resids^2), col='red')) + xlab(name_list[1]) + ggtitle(paste0('Squared Residuals vs ', name_list[1]))
  print(s)
  
  t <- ggplot(aes(x=unlist(new.df[1]), y=stdresids^2), data=new.df) + 
    geom_point() + 
    geom_smooth() + geom_hline(aes(yintercept=mean(stdresids^2), col='red')) + xlab(name_list[1]) + ggtitle(paste0('Squared Standardized Residuals vs ', name_list[1]))
  print(t)
  
  # Those plots are sensitive to outliers, let's try absolute values
  u <- ggplot(aes(x=unlist(new.df[1]), y=abs(resids)), data=new.df) + 
    geom_point() + 
    geom_smooth() + 
    geom_hline(aes(yintercept=mean(abs(resids)), col='red')) + xlab(name_list[1]) + ggtitle(paste0('Absolute Value Residuals vs ', name_list[1]))
  print(u)
  
  v <- ggplot(aes(x=unlist(new.df[1]), y=abs(stdresids)), data=new.df) + 
    geom_point() + 
    geom_smooth() + 
    geom_hline(aes(yintercept=mean(abs(stdresids)), col='red')) + xlab(name_list[1]) + ggtitle(paste0('Absolute Value Standardized Residuals vs ', name_list[1]))
  print(v)
  
  # Always plot distribution of residuals
  w <- ggplot(aes(x=stdresids), data=new.df) + 
    geom_histogram(aes(y=..density..), bins=30) + 
    stat_function(fun = dnorm, args = list(mean = 0, sd = 1), col='blue') + ggtitle(paste0('Distribution of Residuals from ', name_list[1]))
  print(w)
  
  ## ----qqplot-of-residuals, echo=FALSE-------------------------------------
  # An alternative: plot observed quantiles vs. theoretical Gaussian quantiles
  qqnorm(residuals(lm.model))
  qqline(residuals(lm.model))
  
  qqnorm(rstandard(lm.model))
  qqline(rstandard(lm.model))
}

##########################################
#Diagnostics Function for Spline Models
##########################################
spline_diagnostics <- function(data, xvar, yvar){
  
  
  xvar = enquo(arg = xvar)
  yvar = enquo(arg = yvar)
  #create new df to work from to not affect original data
  new.df <- data %>% 
    select(!!xvar, !!yvar) %>%
    mutate(observation = 1:nrow(data))
  name_list = names(new.df)


  lm.model = gam(unlist(new.df[2])~s(unlist(new.df[1])), data=new.df)

  print(summary(lm.model))

  
  # Tack the residuals onto our dataset
  new.df = new.df %>% mutate(resids = residuals(lm.model, type = 'response'),
                       stdresids = (resids/sd(resids)))

  
  # Linear Regression Visualization
  p <-ggplot(aes(x=unlist(new.df[1]), y=!!yvar), data=new.df) + 
    geom_point(alpha = 0.6) + 
    geom_smooth(method="gam") + xlab(name_list[1]) + ggtitle(paste0(name_list[1], ' Regressed to Price'))

  print(p)
  
  # Always plot residuals vs. predictor variable
  # We're looking for nonlinear trends -- indicating E(e_i|X) != 0 --
  # and for patterns in the spread indicating Var(e_i|X) isn't constant
  q <- ggplot(aes(x=unlist(new.df[1]), y=resids), data=new.df) + 
    geom_point() + 
    geom_smooth() + xlab(name_list[1]) + ggtitle(paste0('Residuals vs ', name_list[1]))
  print(q)
  
  r <- ggplot(aes(x=unlist(new.df[1]), y=stdresids), data=new.df) + 
    geom_point() + 
    geom_smooth() + xlab(name_list[1]) + ggtitle(paste0('Standardized Residuals vs ', name_list[1]))
  print(r)
  
   #Dated absolute residuals
  x <- ggplot(aes(x=as.Date(observation,origin="1993-12-31"), y=stdresids), data=new.df) + 
    geom_point() + 
    geom_smooth() + 
    geom_hline(yintercept=0, col='red')
  print(x)
  x <- ggplot(aes(x=as.Date(observation,origin="1993-12-31"), y=abs(stdresids)), data=new.df) + 
    geom_point() + 
    geom_smooth() + 
    geom_hline(yintercept=0, col='red')
  print(x)
  
  # Plotting successive residuals against each other
  # head() and tail() here are used to get "every day except the last"
  # and "every day except the first", respectively
  # see help(head)
  lag_df = data.frame(resid.today =head(new.df$stdresids,-1),
                      resid.tomorrow = tail(new.df$stdresids,-1))
  
  print(cor(lag_df$resid.today, lag_df$resid.tomorrow))

  y <- ggplot(aes(x=resid.today, y=resid.tomorrow), data=lag_df) +
    geom_point() + 
    geom_smooth()
  print(y)




  # Always plot residuals^2 vs.
  # predictor variable
  # It's easier to read off trends in the spread of the residuals here -- remember,
  # the expected residual should be zero so the level here should be constant
  s <- ggplot(aes(x=unlist(new.df[1]), y=resids^2), data=new.df) + 
    geom_point() + 
    geom_smooth() + geom_hline(aes(yintercept=mean(resids^2), col='red')) + xlab(name_list[1]) + ggtitle(paste0('Squared Residuals vs ', name_list[1]))
  print(s)
  
  t <- ggplot(aes(x=unlist(new.df[1]), y=stdresids^2), data=new.df) + 
    geom_point() + 
    geom_smooth() + geom_hline(aes(yintercept=mean(stdresids^2), col='red')) + xlab(name_list[1]) + ggtitle(paste0('Squared Standardized Residuals vs ', name_list[1]))
  print(t)
  
  # Those plots are sensitive to outliers, let's try absolute values
  u <- ggplot(aes(x=unlist(new.df[1]), y=abs(resids)), data=new.df) + 
    geom_point() + 
    geom_smooth() + 
    geom_hline(aes(yintercept=mean(abs(resids)), col='red')) + xlab(name_list[1]) + ggtitle(paste0('Absolute Value Residuals vs ', name_list[1]))
  print(u)
  
  v <- ggplot(aes(x=unlist(new.df[1]), y=abs(stdresids)), data=new.df) + 
    geom_point() + 
    geom_smooth() + 
    geom_hline(aes(yintercept=mean(abs(stdresids)), col='red')) + xlab(name_list[1]) + ggtitle(paste0('Absolute Value Standardized Residuals vs ', name_list[1]))
  print(v)
  
  # Always plot distribution of residuals
  w <- ggplot(aes(x=stdresids), data=new.df) + 
    geom_histogram(aes(y=..density..), bins=30) + 
    stat_function(fun = dnorm, args = list(mean = 0, sd = 1), col='blue') + ggtitle(paste0('Distribution of Residuals from ', name_list[1]))
  print(w)
  
  ## ----qqplot-of-residuals, echo=FALSE-------------------------------------
  # An alternative: plot observed quantiles vs. theoretical Gaussian quantiles
  qqnorm(residuals(lm.model))
  qqline(residuals(lm.model))
  
  qqnorm(rstandard(lm.model))
  qqline(rstandard(lm.model))
}

##DF Numeric Only

num.df.listings <- df_listings[,unlist(lapply(df_listings, is.numeric))] %>% na.omit() %>% subset(beds<50) %>% select(c(beds, bedrooms, accommodates, number_of_reviews, review_scores_rating, price))
names(num.df.listings)
## [1] "beds"                 "bedrooms"             "accommodates"        
## [4] "number_of_reviews"    "review_scores_rating" "price"
num.column.names = c('accommodates', 'beds', 'bedrooms', 'median_price_SQFT', 'total_listing_count', 'number_of_reviews', 'review_scores_rating') 

##Simple Regression: Price v. Accommodates

model_summaries(num.df.listings, accommodates, price)
## [1] "Linear Model"
## 
## Call:
## lm(formula = unlist(new.df[2]) ~ unlist(new.df[1]), data = new.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -570.77  -68.26  -23.83   38.78  891.78 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        38.4324     2.1582   17.81   <2e-16 ***
## unlist(new.df[1])  34.3962     0.3533   97.36   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 133 on 13342 degrees of freedom
## Multiple R-squared:  0.4154, Adjusted R-squared:  0.4153 
## F-statistic:  9480 on 1 and 13342 DF,  p-value: < 2.2e-16
## 
## [1] " "
## [1] " "
## [1] "Squared Model"
## 
## Call:
## lm(formula = unlist(new.df[2]) ~ poly((unlist(new.df[1])), 2), 
##     data = new.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -560.62  -68.76  -23.15   38.24  893.67 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     216.161      1.151 187.757   <2e-16 ***
## poly((unlist(new.df[1])), 2)1 12950.206    132.992  97.376   <2e-16 ***
## poly((unlist(new.df[1])), 2)2  -269.424    132.992  -2.026   0.0428 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 133 on 13341 degrees of freedom
## Multiple R-squared:  0.4156, Adjusted R-squared:  0.4155 
## F-statistic:  4743 on 2 and 13341 DF,  p-value: < 2.2e-16
## 
## [1] " "
## [1] " "
## [1] "Cubic Model"
## 
## Call:
## lm(formula = unlist(new.df[2]) ~ poly((unlist(new.df[1])), 3), 
##     data = new.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -562.29  -68.50  -23.14   37.86  893.86 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     216.161      1.151 187.751   <2e-16 ***
## poly((unlist(new.df[1])), 3)1 12950.206    132.996  97.373   <2e-16 ***
## poly((unlist(new.df[1])), 3)2  -269.424    132.996  -2.026   0.0428 *  
## poly((unlist(new.df[1])), 3)3    55.108    132.996   0.414   0.6786    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 133 on 13340 degrees of freedom
## Multiple R-squared:  0.4156, Adjusted R-squared:  0.4154 
## F-statistic:  3162 on 3 and 13340 DF,  p-value: < 2.2e-16
## 
## [1] " "
## [1] " "
## [1] "Spline Model"
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## unlist(new.df[2]) ~ s(unlist(new.df[1]))
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   216.16       1.15   187.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df    F p-value    
## s(unlist(new.df[1])) 8.357   8.82 1079  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.416   Deviance explained = 41.7%
## GCV =  17675  Scale est. = 17662     n = 13344
## [1] " "
## [1] " "

For accomodations, there is no numerical benefit given by using a model that is more complicated than a linear model as far as R^2, residual standard deviation, or the p-values for any coefficients.

linear_diagnostics(num.df.listings, accommodates, price)
## 
## Call:
## lm(formula = unlist(new.df[2]) ~ unlist(new.df[1]), data = new.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -570.77  -68.26  -23.83   38.78  891.78 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        38.4324     2.1582   17.81   <2e-16 ***
## unlist(new.df[1])  34.3962     0.3533   97.36   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 133 on 13342 degrees of freedom
## Multiple R-squared:  0.4154, Adjusted R-squared:  0.4153 
## F-statistic:  9480 on 1 and 13342 DF,  p-value: < 2.2e-16
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## [1] 0.1031962
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.

Based on the diagnostics, the residuals appear to show a sort of non-linear trend, however this is not necessarily supported by the other types of models and their summary statistics. As a result, this could be due to a concentration of data some levels of accommodates and limited data at the low and high quartiles. There is, in particular a lack of data at the low end of the number of accommodates, which shows the strongest part of the non-linear trend and so this viual mat be attributed to this. The residuals to increase toward the higher end of accomodations, indicating heteroskedacisity. An adjustment factor, such as location could help to fix this. In general, the residuals do appear to be gaussian, with a slightly longer tail in the positive direction, a bootstrap on the parameters can alleviate this issue. The residuals do appear to have a small amount of correlation as the lag plot has some nonlinearity. The value of the correlation is ~0.1 which could be as a result of the time period during which the data was taken and how prices moved in general across that period. In general, the simplicity of this linear model makes it the best explainer for the relationship between accomodations and price.

###Simple Regression: Price v. Beds

model_summaries(num.df.listings, beds, price)
## [1] "Linear Model"
## 
## Call:
## lm(formula = unlist(new.df[2]) ~ unlist(new.df[1]), data = new.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -819.51  -77.57  -28.97   38.73  871.03 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        80.6662     1.9517   41.33   <2e-16 ***
## unlist(new.df[1])  47.3026     0.5378   87.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 138.4 on 13342 degrees of freedom
## Multiple R-squared:  0.367,  Adjusted R-squared:  0.3669 
## F-statistic:  7735 on 1 and 13342 DF,  p-value: < 2.2e-16
## 
## [1] " "
## [1] " "
## [1] "Squared Model"
## 
## Call:
## lm(formula = unlist(new.df[2]) ~ poly((unlist(new.df[1])), 2), 
##     data = new.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -644.53  -74.47  -26.47   39.51  884.53 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     216.161      1.186  182.19   <2e-16 ***
## poly((unlist(new.df[1])), 2)1 12172.477    137.055   88.81   <2e-16 ***
## poly((unlist(new.df[1])), 2)2 -2230.003    137.055  -16.27   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 137.1 on 13341 degrees of freedom
## Multiple R-squared:  0.3793, Adjusted R-squared:  0.3792 
## F-statistic:  4076 on 2 and 13341 DF,  p-value: < 2.2e-16
## 
## [1] " "
## [1] " "
## [1] "Cubic Model"
## 
## Call:
## lm(formula = unlist(new.df[2]) ~ poly((unlist(new.df[1])), 3), 
##     data = new.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -642.77  -73.81  -26.78   39.22  886.19 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     216.161      1.186 182.222   <2e-16 ***
## poly((unlist(new.df[1])), 3)1 12172.477    137.031  88.830   <2e-16 ***
## poly((unlist(new.df[1])), 3)2 -2230.003    137.031 -16.274   <2e-16 ***
## poly((unlist(new.df[1])), 3)3   324.591    137.031   2.369   0.0179 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 137 on 13340 degrees of freedom
## Multiple R-squared:  0.3796, Adjusted R-squared:  0.3794 
## F-statistic:  2720 on 3 and 13340 DF,  p-value: < 2.2e-16
## 
## [1] " "
## [1] " "
## [1] "Spline Model"
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## unlist(new.df[2]) ~ s(unlist(new.df[1]))
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  216.161      1.185   182.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df   F p-value    
## s(unlist(new.df[1])) 8.847  8.987 914  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.381   Deviance explained = 38.1%
## GCV =  18750  Scale est. = 18736     n = 13344
## [1] " "
## [1] " "
cor(num.df.listings$beds, num.df.listings$accommodates)
## [1] 0.8864173

Looking at the models for the relationship between beds and price, it appears to behave similarly to accomodates which makes sense as there is a strong correlation between beds and accomodates (~0.886), as a result of this colinearity, the model behaves the same with relatively similar summary statistics, as such there are not really benefits to justify a more complicated model beyond the linear.

linear_diagnostics(num.df.listings, beds, price)
## 
## Call:
## lm(formula = unlist(new.df[2]) ~ unlist(new.df[1]), data = new.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -819.51  -77.57  -28.97   38.73  871.03 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        80.6662     1.9517   41.33   <2e-16 ***
## unlist(new.df[1])  47.3026     0.5378   87.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 138.4 on 13342 degrees of freedom
## Multiple R-squared:  0.367,  Adjusted R-squared:  0.3669 
## F-statistic:  7735 on 1 and 13342 DF,  p-value: < 2.2e-16
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## [1] 0.1123369
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Note:The data has an extreme point of a unit with 55 beds with the second highest being less than 30. As is would be surprising that an airbnb could have this many beds, and would still be incredibly unusual if it did, I am removing this point from the dataset as it strongly affects the diagnostic plots.

Based on the diagnostics, the residuals appear to show a sort of non-linear trend, this is minorly supported by a slight improvement in RSD and R^2 from the squared and cubic models.
However, this improvement is very limited and does not justify making the model more complicated as it does not better explain the relationship.

This could be due to a concentration of data some levels of accomodates and limited data at the high quartile.
This lack of data at the high end of the number of beds, which shows the strongest part of the non-linear trend and so this visual may be attributed to this. The residuals do increase toward the higher end of accomodations, indicating heteroskedacisity. An adjustment factor, could help to explain whether this comes from beds or another factor .
The residuals do appear to be normal in the bulk of the data, following the assumption of gaussian error assumed by the model summary.

The residuals do appear to have a small amount of correlation as the lag plot has some nonlinearity. The value of the correlation is ~0.1 which could be as a result of the time period during which the data was taken and how prices moved in general across that period. In general, the simplicity of this linear model makes it the best simple explainer for the relationship between accomodations and price.

###Simple Regression: Price v. Bedrooms

model_summaries(num.df.listings, bedrooms, price)
## [1] "Linear Model"
## 
## Call:
## lm(formula = unlist(new.df[2]) ~ unlist(new.df[1]), data = new.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -934.28  -72.57  -23.44   41.43  881.43 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        24.8258     2.2945   10.82   <2e-16 ***
## unlist(new.df[1])  92.7452     0.9608   96.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 133.5 on 13342 degrees of freedom
## Multiple R-squared:  0.4112, Adjusted R-squared:  0.4111 
## F-statistic:  9317 on 1 and 13342 DF,  p-value: < 2.2e-16
## 
## [1] " "
## [1] " "
## [1] "Squared Model"
## 
## Call:
## lm(formula = unlist(new.df[2]) ~ poly((unlist(new.df[1])), 2), 
##     data = new.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -930.08  -72.46  -23.45   41.54  881.54 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     216.161      1.156 187.058   <2e-16 ***
## poly((unlist(new.df[1])), 2)1 12884.569    133.489  96.522   <2e-16 ***
## poly((unlist(new.df[1])), 2)2   -23.008    133.489  -0.172    0.863    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 133.5 on 13341 degrees of freedom
## Multiple R-squared:  0.4112, Adjusted R-squared:  0.4111 
## F-statistic:  4658 on 2 and 13341 DF,  p-value: < 2.2e-16
## 
## [1] " "
## [1] " "
## [1] "Cubic Model"
## 
## Call:
## lm(formula = unlist(new.df[2]) ~ poly((unlist(new.df[1])), 3), 
##     data = new.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -624.93  -73.06  -24.06   42.94  875.94 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     216.161      1.149 188.140   <2e-16 ***
## poly((unlist(new.df[1])), 3)1 12884.569    132.721  97.080   <2e-16 ***
## poly((unlist(new.df[1])), 3)2   -23.008    132.721  -0.173    0.862    
## poly((unlist(new.df[1])), 3)3 -1656.786    132.721 -12.483   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 132.7 on 13340 degrees of freedom
## Multiple R-squared:  0.418,  Adjusted R-squared:  0.4179 
## F-statistic:  3193 on 3 and 13340 DF,  p-value: < 2.2e-16
## 
## [1] " "
## [1] " "
## [1] "Spline Model"
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## unlist(new.df[2]) ~ s(unlist(new.df[1]))
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  216.161      1.146   188.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df    F p-value    
## s(unlist(new.df[1])) 6.427  7.148 1359  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.421   Deviance explained = 42.2%
## GCV =  17521  Scale est. = 17511     n = 13344
## [1] " "
## [1] " "
cor(num.df.listings$accommodates, num.df.listings$bedrooms)
## [1] 0.8619873

Looking at the models for the relationship between bedrooms and price, it appears to behave similarly to accomodates which makes sense as there is a strong correlation between beds and accomodates (~0.862), as a result of this colinearity, the model behaves the same with relatively similar summary statistics, as such there are not really benefits to justify a more complicated model beyond the linear.

linear_diagnostics(num.df.listings, bedrooms, price)
## 
## Call:
## lm(formula = unlist(new.df[2]) ~ unlist(new.df[1]), data = new.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -934.28  -72.57  -23.44   41.43  881.43 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        24.8258     2.2945   10.82   <2e-16 ***
## unlist(new.df[1])  92.7452     0.9608   96.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 133.5 on 13342 degrees of freedom
## Multiple R-squared:  0.4112, Adjusted R-squared:  0.4111 
## F-statistic:  9317 on 1 and 13342 DF,  p-value: < 2.2e-16
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## [1] 0.1396644
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

All the things that were explained by the relationship with bedrooms appear to be true as a result, please refer to the write up of that model for interpretation of the diagnostics.

###Simple Regression: Price v. Number of Reviews

model_summaries(num.df.listings, number_of_reviews, price)
## [1] "Linear Model"
## 
## Call:
## lm(formula = unlist(new.df[2]) ~ unlist(new.df[1]), data = new.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -221.18 -114.03  -49.04   51.96  793.67 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       222.4729     1.7291 128.666  < 2e-16 ***
## unlist(new.df[1])  -0.1476     0.0200  -7.381 1.66e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 173.6 on 13342 degrees of freedom
## Multiple R-squared:  0.004067,   Adjusted R-squared:  0.003992 
## F-statistic: 54.48 on 1 and 13342 DF,  p-value: 1.662e-13
## 
## [1] " "
## [1] " "
## [1] "Squared Model"
## 
## Call:
## lm(formula = unlist(new.df[2]) ~ poly((unlist(new.df[1])), 2), 
##     data = new.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -220.91 -114.16  -49.25   51.95  792.76 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     216.161      1.503 143.831  < 2e-16 ***
## poly((unlist(new.df[1])), 2)1 -1281.380    173.608  -7.381 1.67e-13 ***
## poly((unlist(new.df[1])), 2)2   -65.660    173.608  -0.378    0.705    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 173.6 on 13341 degrees of freedom
## Multiple R-squared:  0.004077,   Adjusted R-squared:  0.003928 
## F-statistic: 27.31 on 2 and 13341 DF,  p-value: 1.457e-12
## 
## [1] " "
## [1] " "
## [1] "Cubic Model"
## 
## Call:
## lm(formula = unlist(new.df[2]) ~ poly((unlist(new.df[1])), 3), 
##     data = new.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -219.12 -114.82  -49.38   51.89  792.16 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     216.161      1.503 143.858  < 2e-16 ***
## poly((unlist(new.df[1])), 3)1 -1281.380    173.574  -7.382 1.65e-13 ***
## poly((unlist(new.df[1])), 3)2   -65.660    173.574  -0.378   0.7052    
## poly((unlist(new.df[1])), 3)3   430.554    173.574   2.481   0.0131 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 173.6 on 13340 degrees of freedom
## Multiple R-squared:  0.004537,   Adjusted R-squared:  0.004313 
## F-statistic: 20.26 on 3 and 13340 DF,  p-value: 4.251e-13
## 
## [1] " "
## [1] " "
## [1] "Spline Model"
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## unlist(new.df[2]) ~ s(unlist(new.df[1]))
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  216.161      1.502   143.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df     F p-value    
## s(unlist(new.df[1])) 3.741    4.6 13.76  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.00466   Deviance explained = 0.494%
## GCV =  30128  Scale est. = 30117     n = 13344
## [1] " "
## [1] " "

There is a very limited improvement across all models from the linear model. The linear model is the simplest and as a result that may be the best choice for interpretation. Note, none of the R^2 values for any of the models are (greater than 0.01), as a result this is likely a poor individual predictor of price by itself, but may be important to add to a larger model.

linear_diagnostics(num.df.listings, number_of_reviews, price)
## 
## Call:
## lm(formula = unlist(new.df[2]) ~ unlist(new.df[1]), data = new.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -221.18 -114.03  -49.04   51.96  793.67 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       222.4729     1.7291 128.666  < 2e-16 ***
## unlist(new.df[1])  -0.1476     0.0200  -7.381 1.66e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 173.6 on 13342 degrees of freedom
## Multiple R-squared:  0.004067,   Adjusted R-squared:  0.003992 
## F-statistic: 54.48 on 1 and 13342 DF,  p-value: 1.662e-13
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## [1] 0.1043173
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

The data is highly highly centered to the low end of number of reviews, as such, there is a lot of noise. In general the model behaves similarly to review_scores_rating and so please refer to that for description of the diagnostics

###Simple Regression: Price v. Review Score Rating

model_summaries(num.df.listings, review_scores_rating, price)
## [1] "Linear Model"
## 
## Call:
## lm(formula = unlist(new.df[2]) ~ unlist(new.df[1]), data = new.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -219.83 -114.23  -51.83   51.07  800.66 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        108.373     15.231   7.115 1.17e-12 ***
## unlist(new.df[1])   22.491      3.162   7.112 1.20e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 173.6 on 13342 degrees of freedom
## Multiple R-squared:  0.003777,   Adjusted R-squared:  0.003702 
## F-statistic: 50.58 on 1 and 13342 DF,  p-value: 1.204e-12
## 
## [1] " "
## [1] " "
## [1] "Squared Model"
## 
## Call:
## lm(formula = unlist(new.df[2]) ~ poly((unlist(new.df[1])), 2), 
##     data = new.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -224.99 -112.93  -51.90   50.67  823.64 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    216.161      1.501 144.013  < 2e-16 ***
## poly((unlist(new.df[1])), 2)1 1234.805    173.389   7.122 1.12e-12 ***
## poly((unlist(new.df[1])), 2)2 1066.296    173.389   6.150 7.98e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 173.4 on 13341 degrees of freedom
## Multiple R-squared:  0.006593,   Adjusted R-squared:  0.006444 
## F-statistic: 44.27 on 2 and 13341 DF,  p-value: < 2.2e-16
## 
## [1] " "
## [1] " "
## [1] "Cubic Model"
## 
## Call:
## lm(formula = unlist(new.df[2]) ~ poly((unlist(new.df[1])), 3), 
##     data = new.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -227.35 -112.75  -52.53   50.83  824.99 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      216.2        1.5 144.068  < 2e-16 ***
## poly((unlist(new.df[1])), 3)1   1234.8      173.3   7.124 1.10e-12 ***
## poly((unlist(new.df[1])), 3)2   1066.3      173.3   6.152 7.87e-10 ***
## poly((unlist(new.df[1])), 3)3    579.8      173.3   3.345 0.000825 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 173.3 on 13340 degrees of freedom
## Multiple R-squared:  0.007425,   Adjusted R-squared:  0.007202 
## F-statistic: 33.26 on 3 and 13340 DF,  p-value: < 2.2e-16
## 
## [1] " "
## [1] " "
## [1] "Spline Model"
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## unlist(new.df[2]) ~ s(unlist(new.df[1]))
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    216.2        1.5   144.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df     F p-value    
## s(unlist(new.df[1])) 4.809  5.777 18.86  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.00805   Deviance explained = 0.841%
## GCV =  30028  Scale est. = 30015     n = 13344
## [1] " "
## [1] " "

There is a very limited improvement across all models from the linear model. The linear model is the simplest and as a result that may be the best choice for interpretation. Note, none of the R^2 values for any of the models are (greater than 0.01), as a result this is likely a poor individual predictor of price by itself, but may be important to add to a larger model.

linear_diagnostics(num.df.listings, review_scores_rating, price)
## 
## Call:
## lm(formula = unlist(new.df[2]) ~ unlist(new.df[1]), data = new.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -219.83 -114.23  -51.83   51.07  800.66 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        108.373     15.231   7.115 1.17e-12 ***
## unlist(new.df[1])   22.491      3.162   7.112 1.20e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 173.6 on 13342 degrees of freedom
## Multiple R-squared:  0.003777,   Adjusted R-squared:  0.003702 
## F-statistic: 50.58 on 1 and 13342 DF,  p-value: 1.204e-12
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## [1] 0.09982595
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

The data is highly highly centered to the high end of review scores, as such, there is a lot of noise elsewhere.

From a model perspective, the residuals do have an average very close to zero, as would be expected from a good model. There is a small correlation between lag values of (~0.1) a low correlation is to be expected, and so this is not particularly concerning. The error does seem homoskedastic with the error at each point in the model hovering around the mean. The error is far from normal with a strong bias to the positive tail. If this were used as an individual model, boostrapping could be used to help limit any misinterpretation from the model summary values.

###Numeric Parameter Correlation Model/Colinearity Reference

my_fn <- function(data, mapping, ...){
  p <- ggplot(data = num.df.listings, mapping = mapping) + 
    geom_point() + 
    geom_smooth(formula = y ~ x, method=lm, fill="blue", color="blue", ...)
  p
}


ggpairs(num.df.listings, title="Numeric Metric Correlation", lower = list(continuous = my_fn))